In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Global seaborn look for every chart in this notebook: the "white" style
# first, then the "talk" plotting context applied on top of it.
sns.set(style="white")
sns.set_context("talk")
In [2]:
# Load the Class Central survey export; the file is ISO-8859-1 encoded and is
# parsed with ',' as the decimal mark.
df = pd.read_csv(
    'raw/2016-17-ClassCentral-Survey-data-noUserText.csv',
    encoding="ISO-8859-1",
    decimal=',',
)
In [3]:
# Number of respondents per world region.
region_answers = df['Which region of the world are you in?']
region_answers.value_counts()
Out[3]:
In [4]:
target_name = 'Latin America'
# Target group: respondents from Central or South America, plus Mexico.
mask_latin_america = df['Which region of the world are you in?'].isin(
    ['Central or South America', 'Mexico']
)
In [5]:
def binary_compare_categorical_barh(mask, feature, df=None,
                                    target_name='target', nontarget_name='Other',
                                    split_name='visitor', answer='answer'):
    """Compare the answer distribution of one question between two groups.

    The frame is split into the rows selected by ``mask`` and all remaining
    rows. For every answer of ``feature`` the share of each group giving that
    answer is computed as a percentage of the group's row count, printed as a
    long-format table and drawn as a horizontal grouped bar chart.

    mask: boolean Series aligned with ``df``; True marks the target group
    feature: column holding the single-choice answers (also the plot title)
    df: dataframe to use; when omitted, the module-level ``df`` is looked up
        at call time
    target_name, nontarget_name: legend labels for the two groups
    split_name, answer: column names used in the long-format table

    Returns None; the table is printed and the plot is shown.
    """
    if df is None:
        # Resolve the default lazily so that a reloaded ``df`` is picked up
        # without having to re-run this definition.
        df = globals()['df']

    target = df[mask]
    nontarget = df[~mask]

    # Percentages are relative to the group size (all rows of the group),
    # not to the number of rows that actually carry an answer.
    res_target = target[feature].value_counts() / len(target) * 100
    res_nontarget = nontarget[feature].value_counts() / len(nontarget) * 100

    # The two Series are aligned on the union of their answers; an answer
    # nobody in a group picked is 0 %, not missing.
    result = pd.DataFrame({target_name: res_target,
                           nontarget_name: res_nontarget}).fillna(0)
    result[answer] = result.index
    res_df = pd.melt(result, id_vars=answer, var_name=split_name, value_name='percentage')
    print(res_df)

    # NOTE(review): factorplot/size are legacy seaborn names (newer releases
    # appear to call these catplot/height) - confirm against the installed version.
    sns.factorplot(x='percentage', y=answer, hue=split_name, data=res_df,
                   kind='bar', orient='h', size=6, aspect=2)
    plt.title(feature)
    sns.despine(left=True, bottom=True)
    plt.show()
In [6]:
def binary_compare_multi_select_categorical_barh(df, target, target_name, question, selectors, nontarget_name = 'Others'):
    """Draw a bar chart for a survey question that allows multiple selections.

    The frame is split into the rows selected by ``target`` and all remaining
    rows. For each selector column the share of each group that ticked it is
    computed as a percentage of the group's row count, printed and drawn as a
    horizontal grouped bar chart.

    df: dataframe to use
    target: boolean mask selecting the target rows of ``df``
    target_name: legend label for the target group
    question: the question being analysed; used as the plot title
    selectors: list of df columns containing the selections (values 0/1)
    nontarget_name: legend label for the remaining rows

    Returns None; the group sizes and the table are printed and the plot is shown.
    """
    target_df = df[target]
    nontarget_df = df[~target]
    size = {target_name: len(target_df), nontarget_name: len(nontarget_df)}
    print(size)

    # Label each group's selector columns via assign, which returns new
    # frames instead of adding a column to a selection of ``df``.
    graph_data = pd.concat([
        target_df.loc[:, selectors].assign(target=target_name),
        nontarget_df.loc[:, selectors].assign(target=nontarget_name),
    ])
    melted = pd.melt(graph_data, id_vars='target', var_name='select', value_name='percentage')

    # Summing the 0/1 flags counts the selections per group and option ...
    grouped = melted.groupby(['target', 'select'], as_index=False).sum()
    # ... which is then expressed as a percentage of the whole group.
    grouped['percentage'] = grouped['percentage'] / grouped['target'].map(size) * 100
    # Drop the "<Prefix>: " part of the column name; labels without a prefix
    # are kept unchanged and only the first separator is split on.
    grouped['select'] = grouped['select'].map(lambda label: label.split(': ', 1)[-1])
    print(grouped)

    # NOTE(review): factorplot/size are legacy seaborn names (newer releases
    # appear to call these catplot/height) - confirm against the installed version.
    sns.factorplot(x='percentage', y='select', hue='target', data=grouped,
                   kind='bar', orient='h', size=6, aspect=2)
    # Use the pyplot module imported by the notebook rather than the
    # ``sns.plt`` alias, matching binary_compare_categorical_barh.
    plt.title(question)
    sns.despine(left=True, bottom=True)
    plt.show()
In [7]:
# Single-choice question: familiarity with MOOCs.
binary_compare_categorical_barh(
    feature='How familiar are you with MOOCs?',
    mask=mask_latin_america,
    target_name='Latin America',
)
Latin American visitors are more familiar with MOOCs than other Class Central visitors.
In [8]:
# Single-choice question: importance of earning a certificate.
binary_compare_categorical_barh(
    feature='How important is the ability to earn a certificate when you complete a MOOC?',
    mask=mask_latin_america,
    target_name='Latin America',
)
Latin American visitors consider the ability to earn a certificate more important than other Class Central visitors do.
In [9]:
# Multi-select question: one 0/1 column per reason.
reasons = [
    'Reasons: Learning skills for current career',
    'Reasons: Learning skills for new career',
    'Reasons: School credit',
    'Reasons: Personal interest',
    'Reasons: Access to reference materials',
]
binary_compare_multi_select_categorical_barh(
    df=df,
    target=mask_latin_america,
    target_name='Latin America',
    question='Which of the following are important reasons for you to take MOOCs?',
    selectors=reasons,
)
Latin Americans indicate more reasons to follow MOOCs than others do; personal interest and learning skills for their current career are the most important.
In [10]:
# Multi-select question: one 0/1 column per decision factor.
decisions = [
    'Decide: Topic/Subject',
    'Decide: Instructor',
    'Decide: Institution/university',
    'Decide: Platform',
    'Decide: Ratings',
    'Decide: Others recommendations',
]
binary_compare_multi_select_categorical_barh(
    df=df,
    target=mask_latin_america,
    target_name='Latin America',
    question='Which are the most important factors in deciding which MOOC to take?',
    selectors=decisions,
)
The topic/subject and the institution are the most important factors when deciding which MOOC to take.
In [11]:
# Multi-select question: one 0/1 column per aspect of the MOOC experience.
aspects = [
    'Aspects: Browsing discussion forums',
    'Aspects: Actively contributing to discussion forums',
    'Aspects: Connecting with other learners in the course environment',
    'Aspects: Connecting with learners outside the course environment',
    'Aspects: Taking the course with other people you know (friends, colleagues, etc.)',
]
binary_compare_multi_select_categorical_barh(
    df=df,
    target=mask_latin_america,
    target_name='Latin America',
    question='Which of the following are important aspects of the MOOC experience to you?',
    selectors=aspects,
)
Connecting with other students is more important for Latin American students than for others
In [12]:
# Multi-select question: one 0/1 column per tangible benefit.
benefits = [
    'Benefit: Have not taken MOOCs',
    'Benefit: Not Really',
    'Benefit: School credit towards a degree',
    'Benefit: Promotion at current organization',
    'Benefit: Higher performance evaluation at current job',
    'Benefit: Helped me get a new job in the same field',
    'Benefit: Helped me get a new job in a different field',
]
binary_compare_multi_select_categorical_barh(
    df=df,
    target=mask_latin_america,
    target_name='Latin America',
    question='Have you received any tangible benefits from taking MOOCs?',
    selectors=benefits,
)
Higher performance evaluation at their current job is perceived as a benefit by 17% of the Latin American visitors.
In [13]:
# Multi-select question: one 0/1 column per factor affecting willingness to pay.
pays = [
    'Pay: The topic/subject',
    'Pay: The institution/university offering the MOOC',
    'Pay: The instructor/professor',
    'Pay: The MOOC platform being used',
    'Pay: A multi-course certification that the MOOC is a part of',
]
binary_compare_multi_select_categorical_barh(
    df=df,
    target=mask_latin_america,
    target_name='Latin America',
    question='Which of the following have a strong impact on your willingness to pay for a MOOC certificate?',
    selectors=pays,
)
The institution/university offering the MOOC has a very strong impact on Latin Americans' willingness to pay for a MOOC certificate.
In [14]:
# Single-choice question: number of MOOCs started.
binary_compare_categorical_barh(
    feature='# MOOCs Started',
    mask=mask_latin_america,
    target_name='Latin America',
)
38% of the Latin American visitors have started 4 or more MOOCs
In [15]:
# Single-choice question: number of MOOCs finished.
binary_compare_categorical_barh(
    feature='# MOOCs Finished',
    mask=mask_latin_america,
    target_name='Latin America',
)
Latin American Class Central visitors are more experienced than visitors from the rest of the world: in general they have started and finished more MOOCs.
In [16]:
# Single-choice question: when the respondent first started taking MOOCs.
binary_compare_categorical_barh(
    feature='When did you first start taking MOOCs?',
    mask=mask_latin_america,
    target_name='Latin America',
)
They also started taking MOOCs earlier.
In [17]:
# Single-choice question: willingness to pay for a certificate.
binary_compare_categorical_barh(
    feature='How willing are you to pay for a certificate for a MOOC?',
    mask=mask_latin_america,
    target_name='Latin America',
)
Latin American Class Central visitors show a higher willingness to pay for a MOOC certificate.
In [18]:
# Single-choice question: perceived employer value of MOOC certificates.
binary_compare_categorical_barh(
    feature='How much do you think employers value MOOC certificates?',
    mask=mask_latin_america,
    target_name='Latin America',
)
In [19]:
# Single-choice question: level of formal education.
binary_compare_categorical_barh(
    feature='What is your level of formal education?',
    mask=mask_latin_america,
    target_name='Latin America',
)
More than half of Latin American visitors have a graduate degree.
In [20]:
# Single-choice question: age range.
binary_compare_categorical_barh(
    feature='What is your age range?',
    mask=mask_latin_america,
    target_name='Latin America',
)
In [ ]: